- Open source
- Collaborative
- Great documentation
- "Fast and furious"
# A single logical flag decides whether the message is printed.
rIsCool <- TRUE
if (rIsCool) {
  print("Hell yes, r is pretty cool")
}
## [1] "Hell yes, r is pretty cool"
# `&&` is the scalar "and": the first branch runs only when BOTH flags are TRUE.
rIsCool <- TRUE
rSucks <- FALSE
if (rIsCool && rSucks) {
  print("nope")
} else {
  print("Someone is lying")
}
## [1] "Someone is lying"
# `||` is the scalar "or": the first branch runs when AT LEAST ONE flag is TRUE.
if (rIsCool || rSucks) {
  print("nope")
} else {
  print("My very first or")
}
## [1] "nope"
# R has no `elif` keyword; conditions are chained with `else if`.
rIsCool <- TRUE
rSucks <- FALSE
if (rSucks) {
  print("nope")
} else if (rIsCool) {
  print("where the heck is elif")
}
## [1] "where the heck is elif"
Let's make a loop that goes from 1 to 10, printing only the prime numbers.
# Primes up to 10. c() combines values into a vector (we'll see it later).
primes <- c(2, 3, 5, 7)
i <- 1
# `<=` so the loop really covers 1 through 10, as the exercise states;
# with `i < 10` the value 10 was never tested.
while (i <= 10) {
  if (i %in% primes) { # %in% tests whether i is one of the values in primes
    print(i)
  }
  i <- i + 1
}
## [1] 2 ## [1] 3 ## [1] 5 ## [1] 7
Let's make a loop that goes from 1 to 10, printing only the prime numbers.
# Primes up to 10. c() combines values into a vector (we'll see it later).
primes <- c(2, 3, 5, 7)
# The for loop creates and advances i by itself, so no `i = 1` is needed first.
for (i in 1:10) {
  if (i %in% primes) { # %in% tests whether i is one of the values in primes
    print(i)
  }
}
## [1] 2 ## [1] 3 ## [1] 5 ## [1] 7
Let's make the sum function
# MyFirstTime: returns the sum of its two arguments.
#   a, b: the values to add.
MyFirstTime <- function(a, b) {
  a + b
}

a <- 42
b <- 54
print(MyFirstTime(a, b))
## [1] 96
# Adding a and b directly prints the same value as the function call.
print(a+b)
## [1] 96
Let's make the sum function.
But… if we don't define the second argument it should be 11
# MyFirstTimeLikeAPro: returns a + b.
#   a: first value to add.
#   b: second value to add; defaults to 11 when the caller omits it.
MyFirstTimeLikeAPro <- function(a, b = 11) {
  a + b
}

a <- 42
print(MyFirstTimeLikeAPro(a))
## [1] 53
Let's make the sum function.
But… if we don't define the second argument it should be a random number (use the command floor(runif(1)*10)).
Return the b argument in case it was not defined (in addition to the sum).
# ICanHandle2: adds a and b.
#   a: first value to add.
#   b: second value to add. When left as NULL, a random integer in 0..9 is
#      drawn with floor(runif(1) * 10) and used instead.
# Returns the plain sum when b is supplied. When b had to be drawn, returns
# list(sum = , b = ) so the caller can also see which b was used.
ICanHandle2 <- function(a, b = NULL) {
  if (!is.null(b)) { # is.null() is the way to test for a missing default
    return(a + b)
  }
  b <- floor(runif(1) * 10)
  # c(sum, b) would also work, but a list keeps the two values named.
  list(sum = a + b, b = b)
}
# Call without b: the function draws a random b and returns a list.
a <- 42
r <- ICanHandle2(a)
print(r$sum)
## [1] 43
# r$b is the second element of the list returned by ICanHandle2.
print(r$b)
## [1] 1
An array is a variable where we can store several values; they must all be of the same type (for instance, all strings or all numbers).
myArray=c(var1,var2) myArray2= c(myArray,var3)
if we want to use a variable inside an array we use the variable index (R index from 1)
var1 <- 42
var2 <- "Hola"
var3 <- 52
arr1 <- c(var1, var2) # This works... but var1 became a string
arr2 <- c(var1, var3) # Both are numbers, and they stay as numbers
A matrix is a variable where we can store several values (they all have to be of the same type, for instance all strings or all numbers) using a double index.
myMatrix=matrix(array, ncol=n , byrow=T)
The idea here is that we fill a matrix of n columns using with the data in the array. We fill the matrix row by row if we define byrow=T.
if we want to use a variable inside an matrix we use a double variable index [row,column]
# Character data: matrix() fills column by column unless byrow is set.
array <- c("hola", "mundo", "hello", "world")
m1 <- matrix(array, ncol = 2)
# Numeric data, filled row by row.
array2 <- c(1, 2, 3, 4)
m2 <- matrix(array2, ncol = 2, byrow = TRUE)
print(m2)
## [,1] [,2] ## [1,] 1 2 ## [2,] 3 4
The last 2 functions apply only for numeric matrix. Be careful with dimension problems!!!
array <- c(3, 2)
m1 <- matrix(array, ncol = 2) # dim 1x2
array2 <- c(1, 2, 3, 4)
m2 <- matrix(array2, ncol = 2, byrow = TRUE) # dim 2x2
# m3 <- cbind(m1, m2) # Dimensional problem! (1 row next to 2 rows)
m3 <- rbind(m1, m2) # stacking rows works: both have 2 columns
print(m3)
## [,1] [,2] ## [1,] 3 2 ## [2,] 1 2 ## [3,] 3 4
array <- c(3, 2)
m1 <- matrix(array, ncol = 2) # dim 1x2
array2 <- c(1, 2, 3, 4)
m2 <- matrix(array2, ncol = 2, byrow = TRUE) # dim 2x2
# m3 <- m2 %*% m1 # Dimensional problem! (2x2 times 1x2)
m3 <- (m2 + m2) %*% t(m1) # 2x2 times 2x1 gives 2x1
print(m3)
## [,1] ## [1,] 14 ## [2,] 34
R has a lot of contributors, the CRAN has a lot of extra packages that allow us to do a lot of extra things!! (with little extra effort)
A list is a variable where we can store any kind of values (they do not have to be of the same type: we can store, for instance, strings and integers together).
mylist=list(name1=var1,name2=var2)
if we want to use a variable inside a list we use the $ operator, or access with its index[[index]]
var1 <- 42
var2 <- "Hola"
# Unlike c(), list() keeps each element's own type.
myList <- list(name1 = var1, name2 = var2)
print(myList)
## $name1 ## [1] 42 ## ## $name2 ## [1] "Hola"
A data frame is used for storing data tables. It is a list of arrays of equal length
DF=data.frame(name1=arr1,name2=arr2)
if we want to use an array inside a data frame we use the $ operator
# Two equal-length vectors become the two columns of the data frame.
array <- c("hola", "mundo", "hello", "world")
array2 <- c(1, 2, 3, 4)
DF <- data.frame(name1 = array, name2 = array2)
print(DF)
## name1 name2 ## 1 hola 1 ## 2 mundo 2 ## 3 hello 3 ## 4 world 4
We can apply almost every matrix function here (except the ones designed for numeric matrices).
MAGIC!!
See the dplyr package: Grammar of data Manipulation. Some important functions are:
Here we select which columns we want to use.
# select() keeps only the named columns.
iris %>%
  select(Petal.Width, Species) %>%
  head()
## Petal.Width Species ## 1 0.2 setosa ## 2 0.2 setosa ## 3 0.2 setosa ## 4 0.2 setosa ## 5 0.2 setosa ## 6 0.4 setosa
Add a column to a Data Frame
# mutate() adds new_column; the single string is repeated on every row.
iris %>%
  mutate(new_column = "hi!") %>%
  head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species new_column ## 1 5.1 3.5 1.4 0.2 setosa hi! ## 2 4.9 3.0 1.4 0.2 setosa hi! ## 3 4.7 3.2 1.3 0.2 setosa hi! ## 4 4.6 3.1 1.5 0.2 setosa hi! ## 5 5.0 3.6 1.4 0.2 setosa hi! ## 6 5.4 3.9 1.7 0.4 setosa hi!
Get the summary of a data frame i.e mean, median, min
# summarise() collapses the whole data frame into one row of statistics.
iris %>%
  summarise(
    mean_Sepal.Length = mean(Sepal.Length),
    min_Petal.Width = min(Petal.Width)
  )
## mean_Sepal.Length min_Petal.Width ## 1 5.843333 0.1
This function is like…
What if we want to know which is the mean of the Petal.Width but for EACH Species?
# group_by() makes summarise() compute one row per Species.
iris %>%
  group_by(Species) %>%
  summarise(mean_Petal.Width = mean(Petal.Width))
## # A tibble: 3 x 2 ## Species mean_Petal.Width ## <fctr> <dbl> ## 1 setosa 0.246 ## 2 versicolor 1.326 ## 3 virginica 2.026
What if we want to know which is the mean of the Petal.Width but for EACH Species?
# group_by() makes summarise() compute one row per Species.
iris %>%
  group_by(Species) %>%
  summarise(mean_Petal.Width = mean(Petal.Width))
## # A tibble: 3 x 2 ## Species mean_Petal.Width ## <fctr> <dbl> ## 1 setosa 0.246 ## 2 versicolor 1.326 ## 3 virginica 2.026
It is like vlookup in excel (EWWW excel). Here we merge two data frame according to a column key.
Let's try to put the mean_Petal.Width in each row of the original data frame.
# Per-species mean of Petal.Width ...
mean.df <- iris %>%
  group_by(Species) %>%
  summarise(mean_Petal.Width = mean(Petal.Width))
# ... joined back onto every row of iris through the Species key.
iris %>%
  left_join(mean.df, by = "Species") %>%
  head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3.0 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5.0 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## mean_Petal.Width ## 1 0.246 ## 2 0.246 ## 3 0.246 ## 4 0.246 ## 5 0.246 ## 6 0.246
Here we can filter a data frame according to a logical condition. Let's try to get a data frame with only the rows where the Sepal.Width is greater than 3.
# filter() keeps only the rows for which the condition is TRUE.
iris %>%
  filter(Sepal.Width > 3) %>%
  head()
## Warning: package 'bindrcpp' was built under R version 3.3.2
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.7 3.2 1.3 0.2 setosa ## 3 4.6 3.1 1.5 0.2 setosa ## 4 5.0 3.6 1.4 0.2 setosa ## 5 5.4 3.9 1.7 0.4 setosa ## 6 4.6 3.4 1.4 0.3 setosa
Here we can filter a data frame according to a logical condition. Let's try to get a data frame with only the rows where the Sepal.Width is greater than 3 and the Sepal.Length is less than 4.7.
iris %>%
  # Watch out! Here we use just one `&` (element-wise), not the double `&&`.
  filter(Sepal.Width > 3 & Sepal.Length < 4.7) %>%
  head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## 1 4.6 3.1 1.5 0.2 setosa ## 2 4.6 3.4 1.4 0.3 setosa ## 3 4.6 3.6 1.0 0.2 setosa ## 4 4.4 3.2 1.3 0.2 setosa ## 5 4.6 3.2 1.4 0.2 setosa
What is the difference between filter and select?
Let's calculate the mean of Petal.Length by species…
But first we have to filter our data so that we work just in the species versicolor and virginica.
We also have to work only with the flowers that have a larger Sepal.Width than the median of their species.
# Median Sepal.Width of each species.
medianDF <- iris %>%
  group_by(Species) %>%
  summarise(mediana = median(Sepal.Width))
# Attach each row's species median, keep the wide-sepal versicolor/virginica
# flowers, then average Petal.Length per species.
iris %>%
  left_join(medianDF, by = "Species") %>%
  # Watch out! Here we use just one `&` (element-wise), not the double `&&`.
  filter(Sepal.Width > mediana & Species %in% c('versicolor', 'virginica')) %>%
  group_by(Species) %>%
  summarise(mean = mean(Petal.Length))
## # A tibble: 2 x 2 ## Species mean ## <fctr> <dbl> ## 1 versicolor 4.473913 ## 2 virginica 5.717647
The command is:
read.csv(file, sep, row.names=1, header=T, stringsAsFactors=F)
this command import the data in a dataframe variable
# Load the example table; text columns stay as character, not factors.
data <- read.csv(
  "data/ejemplo.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
print(head(data))
## var1 var2 var3 var4 var5 ## 1 3.7291 0.6975 3.0196 10.4236 30.7021 ## 2 0.3684 0.9678 3.1390 10.7369 40.7992 ## 3 3.3425 1.9251 3.5338 10.9021 13.6980 ## 4 0.7367 0.7886 3.8229 10.0374 35.0679 ## 5 3.2674 0.5189 3.8438 10.3640 15.4337 ## 6 1.7310 0.6432 3.8834 10.0204 26.7014
Lets suppose we have some variable in an array.
Can we make a function which takes an array and returns a list with its mean, variance, minimum, and first quartile?
# Take the var1 column of data as the example array.
arr=data$var1
# DS: descriptive statistics of a numeric array.
#   arr: the values to summarise.
# Returns a list with elements mean, var, min and fq, where fq is the second
# entry of summary(arr), i.e. the first quartile.
DS <- function(arr) {
  list(
    mean = mean(arr),
    var = var(arr),
    min = min(arr),
    fq = summary(arr)[2]
  )
}
What if we want to apply our DS function to each column in our dataframe (data)?
Lets try:
# The second argument (MARGIN) is 2, so DS is applied to each column of data.
apply(data,2,DS)
ans=apply(X,M,function)
Let's try it with a function that return a vector, not a list.
What if we want to apply our DS function to each element of a list or an array?
Lets try:
# sapply() calls DS once per column of the data frame and simplifies the result.
sapply(data,DS)
See that a dataframe is something like a list of arrays (each column is an array)
sapply returns an array. It tries to convert the list of results into an array, even if the function returns a list.
If you want to get a list, you should use lapply.
What is parallel computation?
Let's try a simple function but a lot of times
# Serial baseline: row means of a 1,000,000 x 10 matrix, timed with Sys.time().
bigM <- matrix(rnorm(10 * 1000000), ncol = 10)
t <- Sys.time()
r1 <- apply(bigM, 1, mean)
Sys.time() - t
## Time difference of 8.436242 secs
Let's try a simple function but a lot of times
# Parallel version: the same row means, computed with parApply() on a cluster.
bigM <- matrix(rnorm(10 * 1000000), ncol = 10)
# Leave one core free, but always ask for at least one worker: detectCores()
# can return 1 (giving 0 workers) or NA.
cl <- makeCluster(max(1, detectCores() - 1, na.rm = TRUE))
var1 <- 42
# clusterExport(cl, list("var1")) # Here we should export any additional global variable that we use in the function that we are applying
# Time the parallel call itself (the timer used to wrap a serial apply()).
t <- Sys.time()
AUX <- parApply(cl, bigM, 1, mean)
Sys.time() - t
# NOTE: the figure below was recorded while the timer still wrapped the serial
# apply() call; re-run this block to obtain the parallel timing.
## Time difference of 8.072358 secs
stopCluster(cl)
Time reduction is not linear, and not all processes should be parallelized!!
Let's try an example to use sapply using parallel computation!
R has native functions for plotting duties, but ggplot2 is the easiest way to make amazing plots!!
So far, the command for make a plot is ggplot(data=dataframe,aes(aesthetic mappings))+ geometry
We can make a vast range of graphics using this package, but here we will see just a few of them.
# Returns table; the first column of the file becomes the row names.
rets <- read.csv(
  "data/Retornos.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  row.names = 1
)
# Box plot: a constant string on x puts every Apple return in a single box.
g <- ggplot(rets, aes(x = "Apple Equity", y = Apple)) +
  geom_boxplot()
g
# Histogram of the same column.
g <- ggplot(rets, aes(x = Apple)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter plot of Apple against Nike.
g <- ggplot(rets, aes(x = Nike, y = Apple)) +
  geom_point()
Res <- read.csv("data/carrePro.csv", header = TRUE, stringsAsFactors = FALSE)
# stat = "identity" uses the Numero values as bar heights; "dodge" places the
# Carrera bars side by side.
g <- ggplot(Res, aes(x = Lenguaje, y = Numero, fill = Carrera)) +
  geom_bar(stat = "identity", position = "dodge")
# Try position = "stack".
# Change the geometry to geom_col()... it is the same.
Now we have seen a bit of how does ggplot works (Pretty cool eh?). But what if we want to make a plot of every asset's return over time in a dataframe like rets?
If we want to use ggplot we may want to map time to x, return to y and Asset's name to col.
A lot of information is not how we need it, we should learn how to transform it to how we want it.
So far, this transformation is the one that I have used the most, since a lot of data is stored like rets. I use the function gather from the package tidyr.
# The row names of rets are parsed as dates and stored in a date column.
rets$date <- as.Date(rownames(rets))
## Warning in strptime(xx, f <- "%Y-%m-%d", tz = "GMT"): unknown timezone ## 'zone/tz/2018e.1.0/zoneinfo/America/Bogota'
# Reshape from wide (one column per company) to long (Company, Ret),
# keeping date out of the gathered columns.
nDF <- gather(rets, "Company", "Ret", -date)
g <- ggplot(nDF, aes(x = date, y = Ret, col = Company)) +
  geom_line()
# Not pretty, but the important thing is that we understood the concept, didn't we?
So far, this transformation is the one that I have used the most, since a lot of data is stored like rets. I use the function gather from the package tidyr.
This is one of my favorites functions that we can use in ggplot. Have you ever wanted to make the same plot but separating groups? ## Hands on!
Let's make one histogram per species of the Sepal.Length. (Fill it according to its group is a plus)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Let's try to see how does plotly works!, with this package i made the following plot:
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
R is pretty intuitive for making samples from probability distributions!. Almost always the command for this matter is rdistribution name(number of samples,parameters of the distribution). Here we will see 6 basic examples.
\[ \begin{align} min \hspace{2mm} c^T x & \\ A x & \leq b \\ x & \geq 0 \end{align} \]
In R we can use the linprog package. Try the following function
solveLP(c,b,A)
# Example from the package
# Objective coefficients: gross margin per animal.
cvec <- c(1800, 600, 600)
names(cvec) <- c("Cows", "Bulls", "Pigs")
# Constraints (quasi-fix factors): available endowment of each resource.
bvec <- c(40, 90, 2500)
names(bvec) <- c("Land", "Stable", "Labor")
# Needs of production activities: one row per resource, one column per animal.
Amat <- rbind(
  c(0.7, 0.35, 0),
  c(1.5, 1, 3),
  c(50, 12.5, 20)
)
# Maximize the gross margin (the fourth argument of solveLP is `maximum`).
ans <- solveLP(cvec, bvec, Amat, maximum = TRUE)
summary(ans)
## ## ## Results of Linear Programming / Linear Optimization ## ## Objective function (Maximum): 93600 ## ## Solution ## opt ## Cows 44 ## Bulls 24 ## Pigs 0
# One bar per animal showing the optimal quantity.
DF <- data.frame("Number" = ans$solution)
DF$Animal <- rownames(DF)
ggplot(DF, aes(x = Animal, y = Number, fill = Animal)) +
  geom_col()
\[ \begin{align} min \hspace{2mm} -d^T x + \frac{1}{2}x^TDx & \\ A^T x & \geq b \end{align} \]
In R we can use the quadprog package. Try the following function
solve.QP(D,d,A,b,meq)
meq is the number of equality constraints (Changing the first meq constraints for equality)
Minimizing the variance of the portfolio, subject to a fully invested-long-only portfolio
\[ \begin{align} min \hspace{2mm} \frac{1}{2}x^T(2 \Sigma)x & \\ \hat{1} x & = 1\\ I^T x & \geq 0 \end{align} \]
# Minimum-variance, fully invested, long-only portfolio with solve.QP.
rets <- read.csv(
  "data/Retornos.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  row.names = 1
)
sigma <- cov(rets)
nc <- ncol(sigma) # number of assets
I <- diag(nc) # identity matrix: one x_i >= 0 constraint per asset
ones <- matrix(rep(1, nc), nrow = 1) # budget row: the weights add up to 1
# The constraints are stacked as rows and then transposed, because the
# problem is stated as t(A) %*% x >= b.
A <- t(rbind(ones, I))
b <- matrix(c(1, rep(0, nc)), ncol = 1)
D <- 2 * sigma
d <- matrix(rep(0, nc), ncol = 1)
# meq = 1: only the first constraint (the budget) is an equality.
ans <- solve.QP(D, d, A, b, meq = 1)
tabla <- data.frame(
  "Weight" = round(ans$solution, 3),
  "Company" = colnames(rets)
)
tabla
## Weight Company ## 1 0.000 Apple ## 2 0.000 Nike ## 3 0.000 Facebook ## 4 0.000 Microsoft ## 5 0.225 CocaCola ## 6 0.051 USBancorp ## 7 0.260 P.G ## 8 0.185 Kellogs ## 9 0.280 AT.T
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# A single stacked bar in polar coordinates: a pie chart of the weights.
ggplot(tabla, aes(x = 1, y = Weight, fill = Company)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y")
\[ \begin{align} min \hspace{2mm} f(x) \\ h_1(x) & \geq 0 \\ h_2(x) & = 0 \end{align} \]
Here we can see the nloptr package for a lot of functions and examples, here we'll see just one example using one function from it.
Here we will see how to use the function slsqp, which implements sequential quadratic programming. We can use it as follows:
slsqp(\(x_0\), \(f\), gr , \(h_1\), \(h_1Jac\) , \(h_2\), \(h_2Jac\))
\(.^*\) will be calculated numerically if not specified.
# Solve the Hock-Schittkowski problem no. 100... taken from the package
# Starting point (7 variables).
x0 <- c(1, 2, 0, 4, 0, 1, 1)
# Objective function of the problem.
#   x: numeric vector of length 7.
f <- function(x) {
  x6 <- x[6]
  x7 <- x[7]
  (x[1] - 10)^2 + 5 * (x[2] - 12)^2 + x[3]^4 + 3 * (x[4] - 11)^2 +
    10 * x[5]^6 + 7 * x6^2 + x7^4 - 4 * x6 * x7 - 10 * x6 - 8 * x7
}
# Inequality constraints, passed to slsqp as `hin`.
#   x: numeric vector of length 7.
# Returns a numeric vector with the four constraint values.
h1 <- function(x) {
  c(
    127 - 2 * x[1]^2 - 3 * x[2]^4 - x[3] - 4 * x[4]^2 - 5 * x[5],
    282 - 7 * x[1] - 3 * x[2] - 10 * x[3]^2 - x[4] + x[5],
    196 - 23 * x[1] - x[2]^2 - 6 * x[6]^2 + 8 * x[7],
    -4 * x[1]^2 - x[2]^2 + 3 * x[1] * x[2] - 2 * x[3]^2 - 5 * x[6] + 11 * x[7]
  )
}
# First attempt: the constraint Jacobian is left for slsqp to approximate.
S1 <- slsqp(x0, fn = f, hin = h1) # not the best solution
# Second attempt: supply the Jacobian of h1 explicitly.
# heps is the finite-difference step size used by nl.jacobian.
h1Jac <- function(x) nl.jacobian(x, h1, heps = 1e-2)
S2 <- slsqp(x0, fn = f, hin = h1, hinjac = h1Jac)
Consider the general linear problem defined as:
\[ \begin{align} y= g( x \beta)+ \epsilon \end{align} \] Where \(y\) is the variable of interest, x are the regressor (predictors), \(\beta\) is the vector that we want to estimate and \(\epsilon\) is the error of the model.
\(g\) is often called the link function and is chosen according to \(y\). Here we will see 3 different examples:
# Simulate a Gaussian response with known coefficients (2, 2, -3) and fit it.
n <- 100
x1 <- rnorm(n)
x2 <- rnorm(n)
y <- 2 + 2 * x1 - 3 * x2 + rnorm(n)
DF <- data.frame("y" = y, "x1" = x1, "x2" = x2)
# `y ~ .` explains y using every other variable in the data frame.
obj <- glm(y ~ ., family = gaussian(link = "identity"), data = DF)
# The same model with the formula specified explicitly.
obj2 <- glm(y ~ x1 + x2, family = gaussian(link = "identity"), data = DF)
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.090203 0.10349607 20.19596 1.609365e-36 ## x1 1.969285 0.09798135 20.09857 2.351873e-36 ## x2 -2.884046 0.10224087 -28.20834 1.516494e-48
# Simulate Poisson counts with known coefficients (2, 2, -3) and fit them.
n <- 100
x1 <- rnorm(n)
x2 <- rnorm(n)
# exp() maps the linear predictor to the Poisson mean (it is the inverse of
# the log link); rpois() then turns the means into count data.
y <- rpois(n = n, lambda = exp(2 + 2 * x1 - 3 * x2))
DF <- data.frame("y" = y, "x1" = x1, "x2" = x2)
obj <- glm(y ~ ., family = poisson(), data = DF)
## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 1.995000 0.015872619 125.6882 0 ## x1 2.001795 0.007912869 252.9797 0 ## x2 -3.001157 0.004997280 -600.5582 0
# Simulate a binary response with known coefficients (2, 2, -3) and fit it.
n <- 100
x1 <- rnorm(n)
x2 <- rnorm(n)
# 1 / (1 + exp(-eta)) maps the linear predictor to a probability (it is the
# inverse of the logit link); rbinom() then draws the 0/1 response.
y <- rbinom(n, 1, 1 / (1 + exp(-(2 + 2 * x1 - 3 * x2))))
DF <- data.frame("y" = y, "x1" = x1, "x2" = x2)
obj <- glm(y ~ ., family = binomial(link = "logit"), data = DF)
## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 1.741816 0.4777665 3.645747 2.666165e-04 ## x1 2.145282 0.5431396 3.949781 7.822287e-05 ## x2 -3.657464 0.8073121 -4.530421 5.886616e-06
This is one of my favorite packages!, it is called dygraphs.
This is the way to plot when we have a time series.
Numeric: numericInput(inputId, label, value); Strings: textInput(inputId, label, value = ""); Dates: dateInput(inputId, label, value = NULL); Checkbox: checkboxInput(inputId, label, value = FALSE)
Be careful with the inputId… this is how you get each input in the server!!
variable=input$inputId
Tell the UI that you are expecting an output!!
Tell the server what to do!
Well those are a bit different! we see those plot in the viewer tab, we have to look in the package info to see how to make them work in a shiny app.
Let's try something!
On the UI
radioButtons(inputId, label, c("Normal" = "norm", "Uniform" = "unif", "Log-normal" = "lnorm", "Exponential" = "exp"))
Can i tell a story?
Once upon a time Mateo…
Begin together!
Look for it online!.
It is pretty easy to use in shiny and, believe me, it is really useful for letting the user enter table inputs!
Try some fancy tricks!